In [1]:
import random
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

| Import Data¶

In [2]:
# Load the raw IBM HR Employee-Attrition dataset.
# Relative path — assumes the notebook is run from its own directory; TODO confirm.
df = pd.read_csv("../data/raw/Employee-Attrition.csv")
# Quick visual sanity check of the first rows.
df.head()
Out[2]:
Age Attrition BusinessTravel DailyRate Department DistanceFromHome Education EducationField EmployeeCount EmployeeNumber ... RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
0 41 Yes Travel_Rarely 1102 Sales 1 2 Life Sciences 1 1 ... 1 80 0 8 0 1 6 4 0 5
1 49 No Travel_Frequently 279 Research & Development 8 1 Life Sciences 1 2 ... 4 80 1 10 3 3 10 7 1 7
2 37 Yes Travel_Rarely 1373 Research & Development 2 2 Other 1 4 ... 2 80 0 7 3 3 0 0 0 0
3 33 No Travel_Frequently 1392 Research & Development 3 4 Life Sciences 1 5 ... 3 80 0 8 3 3 8 7 3 0
4 27 No Travel_Rarely 591 Research & Development 2 1 Medical 1 7 ... 4 80 1 6 3 3 2 2 2 2

5 rows × 35 columns

In [3]:
# (rows, columns) — the output below shows 1470 employees x 35 raw features.
df.shape
Out[3]:
(1470, 35)

| Data Wrangling¶

In [4]:
def generate_meta_signals(df):
    """Build a per-column metadata summary of ``df``.

    For every column the summary row contains:
      - missing-value count and its share of all rows,
      - the column dtype,
      - the numeric ``describe()`` statistics (NaN for non-numeric columns),
      - the most frequent value and its normalized frequency
        (left as NaN when the column has no non-null values).

    Parameters
    ----------
    df : pd.DataFrame

    Returns
    -------
    pd.DataFrame with one row per column of ``df`` and the columns:
    Index, Missing Values, % of Total Values, dtypes, count, mean, std,
    min, 25%, 50%, 75%, max, Most Frequently Value, % of MF Values.
    """
    # The original built this table via a positional column (`meta_df[0]`),
    # a positional concat of dtypes, and a transpose/merge of value_counts —
    # all brittle across pandas versions. Build each row directly instead.
    stat_names = ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']
    numeric_cols = df.select_dtypes(include=np.number).columns
    numeric_stats = df[numeric_cols].describe().T if len(numeric_cols) else pd.DataFrame()

    rows = []
    for col in df.columns:
        n_missing = int(df[col].isna().sum())
        row = {
            'Index': col,
            'Missing Values': n_missing,
            # Guard against an empty frame (the original would divide by zero).
            '% of Total Values': n_missing / len(df) if len(df) else np.nan,
            'dtypes': df[col].dtype,
        }
        if col in numeric_stats.index:
            row.update(numeric_stats.loc[col, stat_names].to_dict())
        # value_counts sorts by frequency, so the first entry is the mode;
        # all-null columns yield an empty result and stay NaN (as before).
        vc = df[col].value_counts(normalize=True)
        if len(vc) > 0:
            row['Most Frequently Value'] = vc.index[0]
            row['% of MF Values'] = vc.iloc[0]
        rows.append(row)

    columns = (['Index', 'Missing Values', '% of Total Values', 'dtypes']
               + stat_names + ['Most Frequently Value', '% of MF Values'])
    return pd.DataFrame(rows, columns=columns)
In [5]:
def bad_flag(row):
    """Flag a meta-signals row as a low-signal ('trash') variable.

    A variable is flagged when any of these hold:
      - zero standard deviation (constant column),
      - more than 15% of its values are missing,
      - a single value covers more than 80% of the rows,
      - no most-frequent-value share could be computed at all.
    """
    mf_share = row['% of MF Values']
    if pd.isnull(mf_share):
        return True
    return bool(
        row['std'] == 0
        or row['% of Total Values'] > .15
        or mf_share > .80
    )
In [6]:
# Build the per-column metadata table and flag low-signal variables.
meta_df = generate_meta_signals(df)

# apply() accepts the function directly; the lambda wrapper was redundant.
meta_df['is Trash var'] = meta_df.apply(bad_flag, axis=1)
meta_df.head(35)
Out[6]:
Index Missing Values % of Total Values dtypes count mean std min 25% 50% 75% max Most Frequently Value % of MF Values is Trash var
0 Age 0 0.0 int64 1470.0 36.923810 9.135373 18.0 30.00 36.0 43.00 60.0 35.0 0.053061 False
1 Attrition 0 0.0 object NaN NaN NaN NaN NaN NaN NaN NaN No 0.838776 True
2 BusinessTravel 0 0.0 object NaN NaN NaN NaN NaN NaN NaN NaN Travel_Rarely 0.709524 False
3 DailyRate 0 0.0 int64 1470.0 802.485714 403.509100 102.0 465.00 802.0 1157.00 1499.0 691.0 0.004082 False
4 Department 0 0.0 object NaN NaN NaN NaN NaN NaN NaN NaN Research & Development 0.653741 False
5 DistanceFromHome 0 0.0 int64 1470.0 9.192517 8.106864 1.0 2.00 7.0 14.00 29.0 2.0 0.143537 False
6 Education 0 0.0 int64 1470.0 2.912925 1.024165 1.0 2.00 3.0 4.00 5.0 3.0 0.389116 False
7 EducationField 0 0.0 object NaN NaN NaN NaN NaN NaN NaN NaN Life Sciences 0.412245 False
8 EmployeeCount 0 0.0 int64 1470.0 1.000000 0.000000 1.0 1.00 1.0 1.00 1.0 1.0 1.0 True
9 EmployeeNumber 0 0.0 int64 1470.0 1024.865306 602.024335 1.0 491.25 1020.5 1555.75 2068.0 2048.0 0.00068 False
10 EnvironmentSatisfaction 0 0.0 int64 1470.0 2.721769 1.093082 1.0 2.00 3.0 4.00 4.0 3.0 0.308163 False
11 Gender 0 0.0 object NaN NaN NaN NaN NaN NaN NaN NaN Male 0.6 False
12 HourlyRate 0 0.0 int64 1470.0 65.891156 20.329428 30.0 48.00 66.0 83.75 100.0 66.0 0.019728 False
13 JobInvolvement 0 0.0 int64 1470.0 2.729932 0.711561 1.0 2.00 3.0 3.00 4.0 3.0 0.590476 False
14 JobLevel 0 0.0 int64 1470.0 2.063946 1.106940 1.0 1.00 2.0 3.00 5.0 1.0 0.369388 False
15 JobRole 0 0.0 object NaN NaN NaN NaN NaN NaN NaN NaN Sales Executive 0.221769 False
16 JobSatisfaction 0 0.0 int64 1470.0 2.728571 1.102846 1.0 2.00 3.0 4.00 4.0 4.0 0.312245 False
17 MaritalStatus 0 0.0 object NaN NaN NaN NaN NaN NaN NaN NaN Married 0.457823 False
18 MonthlyIncome 0 0.0 int64 1470.0 6502.931293 4707.956783 1009.0 2911.00 4919.0 8379.00 19999.0 2342.0 0.002721 False
19 MonthlyRate 0 0.0 int64 1470.0 14313.103401 7117.786044 2094.0 8047.00 14235.5 20461.50 26999.0 4223.0 0.002041 False
20 NumCompaniesWorked 0 0.0 int64 1470.0 2.693197 2.498009 0.0 1.00 2.0 4.00 9.0 1.0 0.354422 False
21 Over18 0 0.0 object NaN NaN NaN NaN NaN NaN NaN NaN Y 1.0 True
22 OverTime 0 0.0 object NaN NaN NaN NaN NaN NaN NaN NaN No 0.717007 False
23 PercentSalaryHike 0 0.0 int64 1470.0 15.209524 3.659938 11.0 12.00 14.0 18.00 25.0 11.0 0.142857 False
24 PerformanceRating 0 0.0 int64 1470.0 3.153741 0.360824 3.0 3.00 3.0 3.00 4.0 3.0 0.846259 True
25 RelationshipSatisfaction 0 0.0 int64 1470.0 2.712245 1.081209 1.0 2.00 3.0 4.00 4.0 3.0 0.312245 False
26 StandardHours 0 0.0 int64 1470.0 80.000000 0.000000 80.0 80.00 80.0 80.00 80.0 80.0 1.0 True
27 StockOptionLevel 0 0.0 int64 1470.0 0.793878 0.852077 0.0 0.00 1.0 1.00 3.0 0.0 0.429252 False
28 TotalWorkingYears 0 0.0 int64 1470.0 11.279592 7.780782 0.0 6.00 10.0 15.00 40.0 10.0 0.137415 False
29 TrainingTimesLastYear 0 0.0 int64 1470.0 2.799320 1.289271 0.0 2.00 3.0 3.00 6.0 2.0 0.372109 False
30 WorkLifeBalance 0 0.0 int64 1470.0 2.761224 0.706476 1.0 2.00 3.0 3.00 4.0 3.0 0.607483 False
31 YearsAtCompany 0 0.0 int64 1470.0 7.008163 6.126525 0.0 3.00 5.0 9.00 40.0 5.0 0.133333 False
32 YearsInCurrentRole 0 0.0 int64 1470.0 4.229252 3.623137 0.0 2.00 3.0 7.00 18.0 2.0 0.253061 False
33 YearsSinceLastPromotion 0 0.0 int64 1470.0 2.187755 3.222430 0.0 0.00 1.0 3.00 15.0 0.0 0.395238 False
34 YearsWithCurrManager 0 0.0 int64 1470.0 4.123129 3.568136 0.0 2.00 3.0 7.00 17.0 2.0 0.234014 False

From this table we see that:

  • there are no missing values in the dataset
  • dataset has several trash variables. Let's drop ['EmployeeCount', 'Over18', 'StandardHours']
  • our target variable is imbalanced (84% of employees have not left the company)

| Data Validation¶

  • Drop trash variables
  • Encode categorical variables to numerical
In [7]:
# Remove the constant / single-valued columns identified in the meta table.
df = df.drop(columns=['EmployeeCount', 'Over18', 'StandardHours'])
In [8]:
from sklearn.preprocessing import LabelEncoder

# Encode the binary target; LabelEncoder assigns codes in sorted class order,
# so 'No' -> 0 and 'Yes' -> 1. Keep the mapping for later interpretation.
# (Removed `map_list = []` — it was never used anywhere in the notebook.)
le_name_mapping = dict()
le = LabelEncoder()
df["Attrition"] = le.fit_transform(df["Attrition"])
le_name_mapping["Attrition"] = dict(zip(le.classes_, le.transform(le.classes_)))

| EDA¶

  • target variable
  • correlation matrix with 'attrition'
  • correlation matrix
  • checking if experience (TotalWorkingYears) is a factor that affects employee from resigning
  • checking if age is a factor that affects employees from resigning
  • checking if gender is a factor that affects employees from resigning
  • checking if travel
  • checking if overtime
  • checking if monthly income
In [9]:
import plotly.express as px

Target variable¶

In [10]:
# Donut chart of the target class balance (hole=.3 turns the pie into a donut).
fig = px.pie(df, "Attrition", title="Attrition distribution", hole=.3)
fig.show()

our target variable is imbalanced (84% of employees have not left the company)

Correlation with target variable¶

In [11]:
# Correlation of every numeric feature with the encoded target, high to low.
feature_cols = df.select_dtypes(include=np.number).columns.difference(['Attrition'])
attr_corr = df[feature_cols].corrwith(df["Attrition"]).sort_values(ascending=False)
df_attr_corr = pd.DataFrame(attr_corr)
In [12]:
# Heatmap of the per-feature correlations with Attrition (single column).
fig = px.imshow(df_attr_corr, color_continuous_scale='Viridis')
fig.update_layout(
    height=800
)
fig.show()

To be honest, there is no significant correlation between any individual feature and 'Attrition'

Correlation matrix¶

In [13]:
# Full feature-feature correlation matrix (numeric columns, target excluded).
# Removed `text = df_corr.values.tolist()` — assigned but never used.
df_corr = df[(df.select_dtypes(include=np.number).columns.difference(['Attrition']))].corr()
fig = px.imshow(df_corr, color_continuous_scale='Viridis', aspect="auto")
fig.update_xaxes(side="top")
fig.show()

We can notice a high correlation between

  • 'JobLevel' and ['MonthlyIncome', 'TotalWorkingYears'] - Positive
  • 'TotalWorkingYears' and ['Age', 'JobLevel', 'MonthlyIncome'] - Positive
  • 'YearsAtCompany' and ['YearsInCurrentRole', 'YearsWithCurrManager'] - Positive
  • 'YearsInCurrentRole' and 'YearsWithCurrManager' - Positive

Experience¶

In [14]:
# Distribution of total working years, split by attrition status.
fig = px.histogram(df, x="TotalWorkingYears", color="Attrition")
fig.show()
In [15]:
# Pearson correlation between experience and the 0/1 encoded target.
df["TotalWorkingYears"].corr(df["Attrition"])
Out[15]:
-0.17106324613622667

Looking at the graph above, experience (TotalWorkingYears) has a very weak correlation with employee attrition, so we can say that experience is not a major factor in employee attrition

Age¶

In [16]:
# Age distribution, split by attrition status.
fig = px.histogram(df, x="Age", color="Attrition")
fig.show()
In [17]:
# Pearson correlation between age and the 0/1 encoded target.
df["Age"].corr(df["Attrition"])
Out[17]:
-0.1592050068657797

Age behaves the same as experience (TotalWorkingYears): neither has a strong correlation with employee attrition

Gender¶

In [18]:
# Attrition counts per gender, shown side by side (grouped bars).
fig = px.histogram(df, x="Gender", color="Attrition", barmode="group")
fig.show()

From the graphs above, all of these features have very weak to no correlation with employee attrition

Travel¶

In [19]:
# Attrition counts per business-travel category (grouped bars).
fig = px.histogram(df, x='BusinessTravel', color='Attrition', barmode="group")
fig.show()

Overtime¶

In [20]:
# Attrition counts for overtime vs non-overtime employees (grouped bars).
fig = px.histogram(df, x='OverTime', color='Attrition', barmode="group")
fig.show()
In [21]:
# Overtime counts broken down by gender.
pd.crosstab(df['OverTime'], df['Gender'])
Out[21]:
Gender Female Male
OverTime
No 408 646
Yes 180 236

Monthly Income¶

In [22]:
# Monthly income distribution, split by attrition status.
fig = px.histogram(df, x="MonthlyIncome", color="Attrition")
fig.show()

Save data for sender¶

In [23]:
# Save two feature rows (target excluded) as a sample request payload.
# `to_csv(index=...)` expects a bool; the original `index=[0]` was merely a
# truthy list and wrote the row index into the file — a request payload
# should carry only the feature columns.
df[df.columns.difference(["Attrition"])].head(2).to_csv("../data/request.csv", index=False)

| Data Processing for Training¶

In [24]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer

from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import MinMaxScaler

Feature Engineering¶

Label Encoder¶

In [25]:
# categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
# print(categorical_columns)
# for cat_col in categorical_columns:
#     le = LabelEncoder()
#     df[cat_col] = le.fit_transform(df[cat_col])
#     le_name_mapping[cat_col] = dict(zip(le.classes_, le.transform(le.classes_)))
In [26]:
# le_name_mapping

Train-test Split¶

In [27]:
# 80/20 hold-out split with a fixed seed for reproducibility.
df_train, df_test = train_test_split(df, test_size=.2, random_state=1)
In [28]:
# Split off the target, then remove it from both feature frames.
y_train, y_test = df_train['Attrition'], df_test['Attrition']
for df_ in [df_train, df_test]:
    del df_['Attrition']
# Series.append was deprecated in pandas 1.4 and removed in 2.0 — use pd.concat.
y = pd.concat([y_train, y_test], ignore_index=True)

One-hot encoding¶

In [29]:
# DictVectorizer consumes rows as a list of {column: value} dicts.
train_dict = df_train.to_dict(orient='records')
test_dict = df_test.to_dict(orient='records')
In [30]:
# DictVectorizer one-hot encodes string values and passes numeric values through.
# sparse=False returns a dense ndarray (a sparse matrix is one that is mostly zeros).
dv = DictVectorizer(sparse=False)
# Learn the feature-name -> column mapping from the TRAIN records only.
dv.fit(train_dict)
Out[30]:
DictVectorizer(sparse=False)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DictVectorizer(sparse=False)
In [31]:
# Apply the fitted vocabulary to both splits; X recombines them for the K-fold CV below.
X_train = dv.transform(train_dict)
X_test = dv.transform(test_dict)
X = np.concatenate((X_train, X_test))
In [32]:
# get_feature_names was deprecated in scikit-learn 1.0 and removed in 1.2
# (the FutureWarning in the original output says exactly this) — use the
# replacement API and show it as a list, as before.
list(dv.get_feature_names_out())
C:\Users\User\Anaconda\lib\site-packages\sklearn\utils\deprecation.py:87: FutureWarning:

Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.

Out[32]:
['Age',
 'BusinessTravel=Non-Travel',
 'BusinessTravel=Travel_Frequently',
 'BusinessTravel=Travel_Rarely',
 'DailyRate',
 'Department=Human Resources',
 'Department=Research & Development',
 'Department=Sales',
 'DistanceFromHome',
 'Education',
 'EducationField=Human Resources',
 'EducationField=Life Sciences',
 'EducationField=Marketing',
 'EducationField=Medical',
 'EducationField=Other',
 'EducationField=Technical Degree',
 'EmployeeNumber',
 'EnvironmentSatisfaction',
 'Gender=Female',
 'Gender=Male',
 'HourlyRate',
 'JobInvolvement',
 'JobLevel',
 'JobRole=Healthcare Representative',
 'JobRole=Human Resources',
 'JobRole=Laboratory Technician',
 'JobRole=Manager',
 'JobRole=Manufacturing Director',
 'JobRole=Research Director',
 'JobRole=Research Scientist',
 'JobRole=Sales Executive',
 'JobRole=Sales Representative',
 'JobSatisfaction',
 'MaritalStatus=Divorced',
 'MaritalStatus=Married',
 'MaritalStatus=Single',
 'MonthlyIncome',
 'MonthlyRate',
 'NumCompaniesWorked',
 'OverTime=No',
 'OverTime=Yes',
 'PercentSalaryHike',
 'PerformanceRating',
 'RelationshipSatisfaction',
 'StockOptionLevel',
 'TotalWorkingYears',
 'TrainingTimesLastYear',
 'WorkLifeBalance',
 'YearsAtCompany',
 'YearsInCurrentRole',
 'YearsSinceLastPromotion',
 'YearsWithCurrManager']

Feature Importance¶

In [33]:
# Numeric feature names (target excluded) for the univariate AUC scoring below.
numerical_vars = list(df[df.columns.difference(["Attrition"])].select_dtypes(include=np.number).columns)
In [34]:
# Univariate ROC AUC of each numeric feature against the training target.
# A feature scoring below 0.5 is re-scored on its negation, so every
# reported AUC ends up >= 0.5 regardless of the direction of the effect.
aucs = dict()
for var in numerical_vars:
    score = roc_auc_score(y_train, df_train[var])
    if score < 0.5:
        score = roc_auc_score(y_train, -df_train[var])
    aucs[var] = score
In [35]:
# Features sorted by univariate AUC, weakest first.
sorted(aucs.items(), key=lambda x: x[1])
Out[35]:
[('MonthlyRate', 0.501490505034657),
 ('Education', 0.5064663263533618),
 ('HourlyRate', 0.5100384953743913),
 ('PerformanceRating', 0.510694093453545),
 ('EmployeeNumber', 0.5113496915326987),
 ('RelationshipSatisfaction', 0.514397942430644),
 ('NumCompaniesWorked', 0.5167317595243832),
 ('TrainingTimesLastYear', 0.5218028386836487),
 ('PercentSalaryHike', 0.5256747897323254),
 ('YearsSinceLastPromotion', 0.5261370704291646),
 ('DailyRate', 0.5464466023769633),
 ('JobSatisfaction', 0.5560480323652521),
 ('WorkLifeBalance', 0.56864448092882),
 ('EnvironmentSatisfaction', 0.5697231358881112),
 ('JobInvolvement', 0.5741946509920823),
 ('DistanceFromHome', 0.5742198663028191),
 ('StockOptionLevel', 0.6228742092198383),
 ('Age', 0.6255666440662772),
 ('YearsWithCurrManager', 0.6292396743302534),
 ('JobLevel', 0.6324560272997765),
 ('YearsInCurrentRole', 0.6384432627491413),
 ('TotalWorkingYears', 0.6402727736281469),
 ('YearsAtCompany', 0.6428167183113587),
 ('MonthlyIncome', 0.6477112902954674)]

Feature Selection¶

Feature Scaling¶

In [36]:
# Scale features to [0, 1]; fit on TRAIN only to avoid test-set leakage.
min_max = MinMaxScaler()
min_max.fit(X_train)
X_train_scaled = min_max.transform(X_train)
X_test_scaled = min_max.transform(X_test)
# Recombined scaled matrix for the K-fold model comparison below.
X_scaled = np.concatenate((X_train_scaled, X_test_scaled))

| Modelling¶

In [37]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
# from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from catboost import CatBoostClassifier
from xgboost import XGBClassifier

from sklearn.model_selection import KFold
from sklearn.metrics import f1_score, precision_score, recall_score, precision_recall_curve
In [38]:
# One instance per model family. A fixed random_state makes the stochastic
# models (tree, forest, boosting) reproducible across notebook re-runs —
# the originals were unseeded.
log_clf = LogisticRegression(random_state=1)
svc_clf = SVC(random_state=1)
# knn_clf = KNeighborsClassifier()
dt_clf = DecisionTreeClassifier(random_state=1)
rf_clf = RandomForestClassifier(random_state=1)
# verbose=False stops CatBoost from flooding the notebook with per-iteration logs.
cat_clf = CatBoostClassifier(random_state=1, verbose=False)
xgb_clf = XGBClassifier(random_state=1)
In [ ]:
# 5-fold CV comparison of all classifiers on the UN-scaled features.
# The original loop assigned the folds to X_train/X_test/y_train/y_test,
# silently clobbering the hold-out split created earlier — a hidden-state
# bug. Fold data now lives in loop-local names, and positions are indexed
# explicitly with .iloc.
kfold = KFold(n_splits=5, shuffle=True, random_state=1)
scores = dict()

for clf in [log_clf, svc_clf, dt_clf, rf_clf, cat_clf, xgb_clf]:
    fold_scores = []
    for train_idxs, test_idxs in kfold.split(X):
        X_tr, X_val = X[train_idxs], X[test_idxs]
        y_tr, y_val = y.iloc[train_idxs], y.iloc[test_idxs]

        clf.fit(X_tr, y_tr)
        y_pred = clf.predict(X_val)
        # NOTE(review): AUC on hard predict() labels, not probabilities —
        # predict_proba would give a proper ranking-based AUC; confirm intent.
        fold_scores.append(roc_auc_score(y_val, y_pred))
    scores[clf.__class__.__name__] = fold_scores
In [ ]:
# Mean +- std of the CV AUC for each classifier.
for name, clf_scores in scores.items():
    print(f'{name} {np.mean(clf_scores):.3f} +- {np.std(clf_scores):.3f}')
In [ ]:
# 5-fold CV comparison of all classifiers on the SCALED features.
kfold = KFold(n_splits=5, shuffle=True, random_state=1)
scores = dict()

for clf in [log_clf, svc_clf, dt_clf, rf_clf, cat_clf, xgb_clf]:
    clf_list = []
    for train_idxs, test_idxs in kfold.split(X_scaled):
        # FIXME(review): these assignments overwrite X_train_scaled/X_test_scaled
        # from the scaling cell above, and the tuning cells further down reuse the
        # LAST fold's X_train_scaled/y_train_scaled/y_test_scaled — hidden state.
        # Rename the fold-local variables and restore the hold-out split before
        # hyper-parameter tuning.
        X_train_scaled, X_test_scaled = X_scaled[train_idxs], X_scaled[test_idxs]
        y_train_scaled, y_test_scaled = y[train_idxs], y[test_idxs]

        clf.fit(X_train_scaled, y_train_scaled)
        y_pred = clf.predict(X_test_scaled)
        clf_list.append(roc_auc_score(y_test_scaled, y_pred))
    scores[clf.__class__.__name__] = clf_list
In [ ]:
# Mean +- std of the CV AUC for each classifier (scaled features).
for name, clf_scores in scores.items():
    print(f'{name} {np.mean(clf_scores):.3f} +- {np.std(clf_scores):.3f}')

This is the reason feature scaling is used: by scaling the features we are able to achieve better accuracy

Performance tuning (hyperparameters)¶

In [ ]:
from sklearn.model_selection import RandomizedSearchCV

LR¶

In [ ]:
# Search space for logistic regression.
# NOTE(review): not every solver supports every penalty (e.g. 'newton-cg'
# cannot use 'l1'); RandomizedSearchCV will simply fail those combos.
lr_grid = {
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'penalty': ['none', 'l1', 'l2', 'elasticnet'],
    'C': [100, 10, 1.0, 0.1, 0.01],
}
print(lr_grid)
In [ ]:
# 20 random draws from the LR grid, 5-fold CV, scored by ROC AUC, all cores.
lr_random = RandomizedSearchCV(log_clf, param_distributions=lr_grid, scoring='roc_auc', cv=5, n_iter=20,
                               random_state=1, n_jobs=-1, verbose=1)
# NOTE(review): X_train_scaled/y_train_scaled here are leftovers from the last
# CV fold above, not the original 80% hold-out split — confirm this is intended.
lr_random.fit(X_train_scaled, y_train_scaled)
In [ ]:
# Positive-class probabilities from the best LR model on the held-back fold.
y_pred_lr = lr_random.predict_proba(X_test_scaled)[:, 1]
In [ ]:
# Best cross-validated ROC AUC and the winning hyper-parameters.
print(lr_random.best_score_)
print(lr_random.best_params_)

KNN¶

In [ ]:
# n_neighbors = [int(x) for x in np.arange(1,22,1)]

# metric = ['eucledian', 'manhattan', 'minkowski']

# weights = ['uniform', 'distance']

# knn_grid = dict(n_neighbors=n_neighbors, weights=weights, metric=metric)
# print(knn_grid)
In [ ]:
# knn_random = RandomizedSearchCV(knn_clf, param_distributions=knn_grid, scoring='roc_auc', cv=5, n_iter=20,
#                                random_state=1, n_jobs=-1, verbose=1)
# knn_random.fit(X_train_scaled, y_train_scaled)
In [ ]:
# y_pred_knn = knn_random.predict_proba(X_test_scaled)[:, 1]
In [ ]:
# print(knn_random.best_score_)
# print(knn_random.best_params_)

SVC¶

In [ ]:
# Search space for the support-vector classifier.
svc_grid = {
    'kernel': ['poly', 'rbf', 'sigmoid'],
    'C': [50, 10, 1.0, 0.1, 0.01],
    'gamma': ['scale'],
}
print(svc_grid)
In [ ]:
# 20 random draws from the SVC grid, 5-fold CV, scored by ROC AUC.
svc_random = RandomizedSearchCV(svc_clf, param_distributions=svc_grid, scoring='roc_auc', cv=5, n_iter=20,
                               random_state=1, n_jobs=-1, verbose=1)
svc_random.fit(X_train_scaled, y_train_scaled)
In [ ]:
# Best cross-validated ROC AUC and the winning hyper-parameters.
print(svc_random.best_score_)
print(svc_random.best_params_)

RF¶

In [ ]:
# Hyper-parameter search space for the random forest.
n_estimators = list(range(100, 1300, 100))       # number of trees in the forest
max_depth = list(range(5, 35, 5))                # maximum tree depth
criterion = ['gini', 'entropy']                  # split-quality measure
min_samples_split = [2, 5, 7, 10]                # min samples to split an internal node
min_samples_leaf = [2, 5, 8]                     # min samples required at a leaf node
# Number of features considered when looking for the best split.
# NOTE(review): 'auto' is deprecated/removed in newer scikit-learn — confirm version.
max_features = ["auto", "sqrt"]

random_grid = {
    'n_estimators': n_estimators,
    'max_depth': max_depth,
    'criterion': criterion,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'max_features': max_features,
}
print(random_grid)
In [ ]:
# 20 random draws from the forest grid, 5-fold CV, scored by ROC AUC.
rf_random = RandomizedSearchCV(rf_clf, param_distributions=random_grid, scoring='roc_auc', cv=5, n_iter=20,
                               random_state=1, n_jobs=-1, verbose=1)

rf_random.fit(X_train_scaled, y_train_scaled)
In [ ]:
# Positive-class probabilities from the best forest on the held-back fold.
y_pred_rf = rf_random.predict_proba(X_test_scaled)[:, 1]
In [ ]:
# Best cross-validated ROC AUC and the winning hyper-parameters.
print(rf_random.best_score_)
print(rf_random.best_params_)

CatB¶

In [ ]:
# CatBoost hyper-parameter search space.
n_estimators = [int(x) for x in np.linspace(100, 1000, 10)]

max_depth = [int(x) for x in np.linspace(6, 30, 5)]

learning_rate = [x for x in np.arange(0.001, 0.4, 0.001)]

l2_leaf_reg = [x for x in np.arange(0, 4, 0.1)]

# Seeded RNG so the candidate leaf sizes — and therefore the whole search —
# are reproducible; the original used unseeded np.random.randint.
min_data_in_leaf = np.random.RandomState(1).randint(20, 200, 5)

cat_grid = dict(n_estimators=n_estimators, max_depth=max_depth, learning_rate=learning_rate,
                l2_leaf_reg=l2_leaf_reg, min_data_in_leaf=min_data_in_leaf)
In [ ]:
# cat_random = RandomizedSearchCV(cat_clf, param_distributions=cat_grid, cv=5, random_state=1,
#                                 n_iter=20, scoring="roc_auc", n_jobs=-1, verbose=1)
# cat_random.fit(X_train_scaled,y_train_scaled)
In [ ]:
# y_pred_cat = cat_random.predict_proba(X_test_scaled)[:, 1]
In [ ]:
# print("score: ", cat_random.best_score_)
# print("best_params: \n", cat_random.best_params_)

XGB¶

In [ ]:
# XGBoost hyper-parameter search space.
n_estimators = list(range(100, 1100, 100))

max_depth = [int(x) for x in np.linspace(6, 30, 5)]

learning_rate = list(np.arange(0.001, 0.4, 0.001))

# minimum sum of instance weight needed in a child node
min_child_weight = [1, 2, 3, 4, 5, 6, 7, 8, 9]

xg_grid = {
    'n_estimators': n_estimators,
    'max_depth': max_depth,
    'learning_rate': learning_rate,
    'min_child_weight': min_child_weight,
}
print(xg_grid)
In [ ]:
# 20 random draws from the XGBoost grid, 5-fold CV, scored by ROC AUC.
xgb_random = RandomizedSearchCV(xgb_clf, param_distributions=xg_grid, cv=5, random_state=1,
                                n_iter=20, scoring="roc_auc", n_jobs=-1, verbose=1)
xgb_random.fit(X_train_scaled, y_train_scaled)
In [ ]:
# Positive-class probabilities from the best XGBoost model on the held-back fold.
y_pred_xgb = xgb_random.predict_proba(X_test_scaled)[:, 1]
In [ ]:
# Best cross-validated ROC AUC and the winning hyper-parameters.
print("score: ",xgb_random.best_score_)
print("best_params: \n",xgb_random.best_params_)

Create a confusion matrix¶

In [ ]:
# Candidate decision thresholds: 0.00, 0.01, ..., 1.00.
thresholds = np.linspace(0, 1, 101)
In [ ]:
def confusion_matrix_dataframe(y_val, y_pred, thresholds=None):
    """Confusion-matrix-derived metrics across a sweep of decision thresholds.

    Parameters
    ----------
    y_val : array-like of 0/1 ground-truth labels.
    y_pred : array-like of predicted positive-class scores/probabilities.
    thresholds : optional array of thresholds to evaluate. Defaults to
        0.00..1.00 in steps of 0.01 (previously this was read from a
        module-level global, which made the function depend on hidden state).

    Returns
    -------
    pd.DataFrame with one row per threshold and columns:
    threshold, tp, fp, fn, tn, tpr, fpr, precision, recall, f1, gmean, auc.
    Ratios with a zero denominator (e.g. precision at a threshold above every
    score) are reported as NaN instead of raising warnings or errors.
    """
    if thresholds is None:
        thresholds = np.linspace(0, 1, 101)

    def _safe_div(num, den):
        # Explicit NaN on a zero denominator avoids division warnings/errors
        # at the extreme thresholds.
        return num / den if den else np.nan

    scores = []
    for t in thresholds:
        actual_positive = (y_val == 1)
        actual_negative = (y_val == 0)

        predict_positive = (y_pred >= t)
        predict_negative = (y_pred < t)

        tp = (predict_positive & actual_positive).sum()
        tn = (predict_negative & actual_negative).sum()
        fp = (predict_positive & actual_negative).sum()
        fn = (predict_negative & actual_positive).sum()

        tpr = _safe_div(tp, tp + fn)        # sensitivity; identical to recall
        fpr = _safe_div(fp, fp + tn)
        precision = _safe_div(tp, tp + fp)
        recall = tpr
        f1 = _safe_div(2 * precision * recall, precision + recall)
        gmean = np.sqrt(tpr * (1 - fpr))
        auc = (1 + tpr - fpr) / 2           # balanced accuracy at this threshold

        scores.append((t, tp, fp, fn, tn, tpr, fpr, precision, recall, f1, gmean, auc))

    columns = ['threshold', 'tp', 'fp', 'fn', 'tn', 'tpr', 'fpr',
               'precision', 'recall', 'f1', 'gmean', 'auc']
    return pd.DataFrame(scores, columns=columns)
In [ ]:
# Threshold-sweep metric table for the tuned logistic-regression probabilities.
# NOTE(review): y_test_scaled is the last CV fold's labels, not the hold-out
# split — confirm this matches y_pred_lr's rows.
df_scores = confusion_matrix_dataframe(y_test_scaled, y_pred_lr)
df_scores.head()

Find threshold¶

In [ ]:
import plotly.graph_objects as go
In [ ]:
# Locate the threshold with the largest G-mean (geometric mean of TPR and 1-FPR).
# NOTE(review): np.argmax does not skip NaN — if the score table can contain
# NaN rows, prefer df_scores.gmean.idxmax(); confirm.
ix = np.argmax(df_scores.gmean)
print('Best Threshold=%f, G-Mean=%.3f' % (df_scores.threshold[ix], df_scores.gmean[ix]))
In [ ]:
# Rows where precision equals recall (the break-even threshold).
df_scores[df_scores.precision==df_scores.recall]
In [ ]:
# Precision and recall as functions of the threshold; the dotted vertical
# line marks the chosen cut-off (0.26).
fig = go.Figure()
fig.add_trace(go.Scatter(x=df_scores.threshold, y=df_scores.precision,
                    mode='lines',
                    name='precision'))
fig.add_trace(go.Scatter(x=df_scores.threshold, y=df_scores.recall,
                    mode='lines',
                    name='recall'))
fig.add_vline(0.26, line_dash="dot")
fig.show()
In [ ]:
# F1 score as a function of the threshold; dotted line marks 0.25.
fig = go.Figure()
fig.add_trace(go.Scatter(x=df_scores.threshold, y=df_scores.f1,
                    mode='lines',
                    name='lines'))
fig.add_vline(0.25, line_dash="dot")
fig.show()
In [ ]:
# ROC curve assembled from the threshold sweep (fpr vs tpr).
# NOTE(review): the vertical line is at FPR=0.15 on this plot, not at a
# probability threshold — confirm that is intended.
fig = go.Figure()
fig.add_trace(go.Scatter(x=df_scores.fpr, y=df_scores.tpr,
                    mode='lines'))
fig.update_layout(
    title="ROC Curve",
    xaxis_title="False Positive Rate",
    yaxis_title="True Positive Rate",
)
fig.add_vline(0.15, line_dash="dot")
fig.show()

| Make html¶

In [ ]:
# Export this notebook to a standalone HTML report.
!jupyter nbconvert --to html "EDA, Modelling + Tuning.ipynb"